This script takes the geotagged tweets and assigns each unique user a user type, either local or tourist. If the user has self-identified their location as somewhere in the Santa Barbara area, they are designated a local. This includes Carpinteria, Santa Barbara, Montecito, Goleta, Isla Vista, Gaviota and UCSB, as well as any location containing "805". For the remainder, we use the number of months in which they have tweeted from Santa Barbara within a year to designate user type. If someone has tweeted across more than 2 months in the same year from Santa Barbara, they are identified as a local. This is consistent with how Eric Fischer determined tourists in his work. This is not fool-proof: there are instances where people visit and tweet from Santa Barbara in more than two months of a year, especially if they are visiting family or live within a couple of hours' driving distance.
# Chunk defaults: echo the code, but keep messages and warnings out of the
# knitted document. Spelled-out TRUE/FALSE because T and F can be reassigned.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)
library(leaflet)
library(RColorBrewer)
library(ggmap)
library(viridis)
library(sf)
library(mapview)
#remotes::install_github("wilkelab/ggtext")
library(ggtext)
# Register the key stored in the GOOGLE_ACCESS_TOKEN environment variable
# with ggmap.
register_google(Sys.getenv("GOOGLE_ACCESS_TOKEN"))
Think of another way to identify tourists and locals. Using Eric Fischer’s methods, we identify a local as someone who has tweeted from the Santa Barbara area across several months of the same year. A tourist is less likely to tweet from Santa Barbara in multiple months of a year.
# Read the geotagged tweets.
tweet_df <- read_csv("../data/geotagged_sb_tweets_post_apr_2015.csv")

# One row per user / self-reported location / year, holding the number of
# distinct months in which that user tweeted during that year.
num_month_tweets <- tweet_df %>%
  distinct(user_id, user_location, Year, month_num) %>%
  count(user_id, user_location, Year, name = "num_unique_months_tweeted")
We have 67231 tweets from 14353 people.
What we really want to do is use the number of tweets in a month/year range to tell us whether or not someone is a local or tourist. We can do this across the entire dataset, and then compare to our user location designation to see how right or wrong we were.
Our assumption is that a local’s tweets will occur across multiple months within a single year, whereas a tourist is most likely to tweet only within a single month (when visiting) or during separate visits spread across different months or years.
First, if folk self-identify as from the Santa Barbara area in their user location, we assign them locals, otherwise unknown.
Then we take the unknowns and apply a couple of rules:

1. if they have tweeted in more than 2 months within the same year, they are a local
2. if they have only tweeted in 1 or 2 months within a year, they are a tourist
# First pass: users whose self-reported location mentions a Santa Barbara area
# name are "local"; everyone else is "unknown" for now.
# str_detect() is vectorised over rows, so no group_by() is needed; leaving it
# out also avoids handing a grouped tibble to the plotting code below.
tourist_local <- num_month_tweets %>%
  mutate(user_type = case_when(
    str_detect(
      tolower(user_location),
      paste(
        c("santa barbara", "carpinteria", "isla vista", "goleta",
          "montecito", "805", "gaviota", "ucsb"),
        collapse = "|"
      )
    ) ~ "local",
    # Fallback; rows with a missing user_location (NA condition) land here too.
    TRUE ~ "unknown"
  ))
# Distribution of "months tweeted per year", split by first-pass user type.
# The x values are whole month counts (1-12), so count them with geom_bar()
# instead of binning: with bins = 12 and scale limits of 0-12 the histogram
# bins did not line up with the integer values, and bars that cross the scale
# limits are dropped from the plot.
ggplot(tourist_local) +
  geom_bar(aes(num_unique_months_tweeted), width = 1) +
  theme_minimal() +
  scale_x_continuous(breaks = 1:12) +
  # Zoom rather than set scale limits, so no bar is censored at the edges.
  coord_cartesian(xlim = c(0.5, 12.5)) +
  facet_wrap(~user_type, scales = "free") +
  labs(
    x = "Number of months tweeted in a year",
    y = "Number of users",
    title = "How many unique months are SB geotag tweeters tweeting?"
  )

# Saves the plot drawn just above (ggsave() defaults to the last plot).
ggsave("../figs/user_threshold.png")
Based on this, let’s stick with the 3-month threshold: tweeting in more than 2 months of a single year marks a user as a local.
# Final classification, one row per distinct user / location / user_type.
# Rules are checked in order:
#   1. self-reported location mentions a Santa Barbara area name -> "local"
#   2. tweeted in MORE than 2 months of a single year            -> "local"
#   3. otherwise (1 or 2 months in that year)                    -> "tourist"
# No grouping is needed (every step is row-wise), so the result is a plain
# ungrouped tibble.
tourist_local <- num_month_tweets %>%
  mutate(user_type = case_when(
    str_detect(
      tolower(user_location),
      paste(
        c("santa barbara", "carpinteria", "isla vista", "goleta",
          "montecito", "805", "gaviota", "ucsb"),
        collapse = "|"
      )
    ) ~ "local",
    num_unique_months_tweeted > 2 ~ "local",
    TRUE ~ "tourist"
  )) %>%
  # Drop the per-year columns; a user who gets different labels in different
  # years keeps one row per label.
  select(-num_unique_months_tweeted, -Year) %>%
  distinct()
Because we assigned “tourist” or “local” separately for each year, some users are labelled a tourist in one year and a local in another. For those users, we default to calling them local in every year.
# Users whose labels conflict, i.e. who were classified as both "tourist" and
# "local". Counting distinct user_type values per user (instead of using
# duplicated(user_id)) avoids flagging users who only appear twice because
# their user_location text changed while their label stayed the same.
dups <- tourist_local %>%
  ungroup() %>%
  distinct(user_id, user_type) %>%
  count(user_id) %>%
  filter(n > 1) %>%
  pull(user_id)

# Conflicting users default to "local"; distinct() then collapses the rows
# that have become identical.
tourist_local_update <- tourist_local %>%
  ungroup() %>%
  mutate(user_type = if_else(user_id %in% dups, "local", user_type)) %>%
  distinct()

# Attach the user type to every tweet. The keys are spelled out so the join
# does not depend on whichever column names the two tables happen to share.
tweet_df_w_user_type <- tweet_df %>%
  left_join(tourist_local_update, by = c("user_id", "user_location"))
There are 21811 tweets from tourists and 45420 tweets from locals.
Save
# Write the tweets, now carrying a user_type column, to a CSV file.
write_csv(tweet_df_w_user_type, "../data/geotag_sb_tweets_user_type.csv")
Turn the tweet_df_w_user_type data frame into a spatial object.
# Build point geometries from the lon/lat columns in EPSG:4326.
# remove = FALSE keeps lon and lat as ordinary columns, which the static map
# uses as plotting aesthetics.
tweet_df_w_user_type_sf <- tweet_df_w_user_type %>%
  st_as_sf(coords = c("lon", "lat"), crs = 4326, remove = FALSE)

# Slimmed-down copy holding only what the static map needs.
static_df <- tweet_df_w_user_type_sf %>%
  select(user_type, geometry, lat, lon)
# santa barbara
# NOTE(review): "toner-lite" is a Stamen map type; whether this tile source is
# still served depends on the installed ggmap version -- confirm before re-running.
sb.map <- get_map("santa barbara, california", zoom = 14, maptype = "toner-lite")

# Static map of all tweets, coloured by user type. The legend is hidden, so
# the coloured words in the title act as the key.
ggmap(sb.map, legend = "none") +
  coord_equal() +
  geom_point(
    data = static_df,
    aes(x = lon, y = lat, color = user_type),
    size = 0.5, alpha = 0.3
  ) +
  # Named values tie each colour to its user type explicitly, so the points
  # always agree with the red "locals" / blue "tourists" wording in the title
  # regardless of factor or alphabetical ordering.
  scale_color_manual(values = c(local = "red", tourist = "blue")) +
  labs(
    x = NULL,
    y = NULL,
    color = "User type",
    title = "Santa Barbara tweets from <b style='color:#FF0000'>locals</b> and <b style='color:#0000FF'>tourists</b>",
    subtitle = "2015 - 2019"
  ) +
  theme(
    axis.text = element_blank(),
    # element_markdown() (ggtext) renders the inline HTML in the title.
    plot.title = element_markdown(lineheight = 1.1),
    plot.subtitle = element_markdown(lineheight = 1.1),
    legend.position = "none"
  )
Get hex density by overlaying with points
# Hexagon grid, reprojected to the CRS of the tweet points.
hex_grid <- st_transform(
  read_sf("../data/sb_area_hexagons.shp"),
  st_crs(tweet_df_w_user_type_sf)
)

# Tweet points split by user type.
locals <- filter(tweet_df_w_user_type_sf, user_type == "local")
tourists <- filter(tweet_df_w_user_type_sf, user_type == "tourist")

# Per-hexagon tweet counts for each user type, their natural logs, the
# combined total and the local-minus-tourist difference.
# Hexagons with a count of zero get a log of -Inf.
hex_tweet_count <- hex_grid %>%
  mutate(
    local_tweet_count = lengths(st_intersects(., locals)),
    local_log_tweet_count = log(local_tweet_count),
    tourist_tweet_count = lengths(st_intersects(., tourists)),
    tourist_log_tweet_count = log(tourist_tweet_count),
    total = local_tweet_count + tourist_tweet_count,
    diff = local_tweet_count - tourist_tweet_count
  )
# Colour ramps: light-to-dark blue for locals, light-to-dark red for tourists.
blues <- colorRampPalette(c("#DEEBF7", "#08306B"))
reds <- colorRampPalette(c("#FEE0D2", "#67000D"))

# Two overlaid interactive layers of log tweet counts; hexagons with no tweets
# of the given type are left out of that layer.
mapview(
  filter(hex_tweet_count, local_tweet_count > 0),
  zcol = "local_log_tweet_count",
  layer.name = "# tweets by locals (log)",
  col.regions = blues,
  alpha.regions = 0.5
) +
  mapview(
    filter(hex_tweet_count, tourist_tweet_count > 0),
    zcol = "tourist_log_tweet_count",
    layer.name = "# tweets by tourists (log)",
    col.regions = reds,
    alpha.regions = 0.5
  )
We want to know where tourists go that locals do not and vice versa.
# Hexagons with more tourist tweets than local tweets (diff < 0); `value` is
# the size of the tourist surplus and `log_value` its base-10 log.
t <- mutate(
  filter(hex_tweet_count, diff < 0),
  value = -1 * diff,
  log_value = log10(value)
)

# Hexagons with more local tweets than tourist tweets (diff > 0).
l <- mutate(
  filter(hex_tweet_count, diff > 0),
  log_value = log10(diff)
)

# Map: purple layer where tourists dominate, orange layer where locals
# dominate. Each ramp uses shades 4 to 9 of its 9-colour Brewer palette.
tl_map <- mapview(
  t,
  zcol = "log_value",
  layer.name = "Tourist preferred # tweets",
  col.regions = colorRampPalette(brewer.pal(9, "Purples")[4:9]),
  alpha.regions = 1
) +
  mapview(
    l,
    zcol = "log_value",
    layer.name = "Local preferred # tweets",
    col.regions = colorRampPalette(brewer.pal(9, "Oranges")[4:9]),
    alpha.regions = 1
  )

# Display the underlying leaflet map with the view set to the given centre
# and zoom level.
tl_map@map %>% setView(lng = -119.714, lat = 34.426, zoom = 13)